!pip install boto3
Requirement already satisfied: boto3 in /usr/local/lib/python3.10/dist-packages (1.28.54) Requirement already satisfied: botocore<1.32.0,>=1.31.54 in /usr/local/lib/python3.10/dist-packages (from boto3) (1.31.54) Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from boto3) (1.0.1) Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from boto3) (0.6.2) Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.10/dist-packages (from botocore<1.32.0,>=1.31.54->boto3) (2.8.2) Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/local/lib/python3.10/dist-packages (from botocore<1.32.0,>=1.31.54->boto3) (1.26.16) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.32.0,>=1.31.54->boto3) (1.16.0)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import boto3
import yaml
import plotly.express as px
from matplotlib.ticker import FuncFormatter
import dtale
def number_formatter(number, pos=None):
"""Convert a number into a human readable format."""
magnitude = 0
while abs(number) >= 1000:
magnitude += 1
number /= 1000.0
return '%.1f%s' % (number, ['', 'K', 'M', 'B', 'T', 'Q'][magnitude])
with open("/content/sample_data/credentials.yaml", "r") as f:
config = yaml.safe_load(f)
def cargar_datos_s3(bucket, bucket_path):
session = boto3.Session(
aws_access_key_id = config['s3']['aws_access_key_id'],
aws_secret_access_key = config['s3']['aws_secret_access_key'],
aws_session_token = config['s3']['aws_session_token']
)
s3 = session.resource('s3')
obj = s3.Object(bucket, bucket_path).get()['Body'].read()
dataset = pickle.loads(obj)
return dataset
session = boto3.Session(
aws_access_key_id = config['s3']['aws_access_key_id'],
aws_secret_access_key = config['s3']['aws_secret_access_key'],
aws_session_token = config['s3']['aws_session_token']
)
s3 = session.client('s3')
bucket = "aplicaciones-cd-1-2" + config['iexe']['matricula']
key = "limpieza/"
bucket_path = s3.list_objects_v2(Bucket=bucket, Prefix=key)['Contents'][-1]['Key']
bucket_path
'limpieza/datos-limpios-2023-09-17.pkl'
dataset = cargar_datos_s3(bucket, bucket_path)
dataset.head()
| inspection_id | dba_name | aka_name | license_ | facility_type | risk | address | city | state | zip | inspection_date | inspection_type | results | latitude | longitude | location | violations | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 67757 | DUNKIN DONUTS/BASKIN-ROBBINS | DUNKIN DONUTS/BASKIN-ROBBINS | 1380279 | Restaurant | Risk 2 (Medium) | 100 W RANDOLPH ST | CHICAGO | IL | 60601 | 2010-01-04 | Tag Removal | Pass | 41.884586 | -87.631010 | {'latitude': '41.88458626715456', 'longitude':... | NaN |
| 1 | 104236 | TEMPO CAFE | TEMPO CAFE | 80916 | Restaurant | Risk 1 (High) | 6 E CHESTNUT ST | CHICAGO | IL | 60611 | 2010-01-04 | Canvass | Fail | 41.898431 | -87.628009 | {'latitude': '41.89843137207629', 'longitude':... | 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN... |
| 2 | 67732 | WOLCOTT'S | TROQUET | 1992039 | Restaurant | Risk 1 (High) | 1834 W MONTROSE AVE | CHICAGO | IL | 60613 | 2010-01-04 | License Re-Inspection | Pass | 41.961606 | -87.675967 | {'latitude': '41.961605669949854', 'longitude'... | NaN |
| 4 | 67733 | WOLCOTT'S | TROQUET | 1992040 | Restaurant | Risk 1 (High) | 1834 W MONTROSE AVE | CHICAGO | IL | 60613 | 2010-01-04 | License Re-Inspection | Pass | 41.961606 | -87.675967 | {'latitude': '41.961605669949854', 'longitude'... | NaN |
| 5 | 52234 | Cafe 608 | Cafe 608 | 2013328 | Restaurant | Risk 1 (High) | 608 W BARRY AVE | CHICAGO | IL | 60657 | 2010-01-04 | License Re-Inspection | Pass | 41.938007 | -87.644755 | {'latitude': '41.938006880423615', 'longitude'... | NaN |
texto en negrita### Los 5 establecimientos con más inspecciones del dataset
inspections_by_type_stablishment = dataset.groupby(['facility_type'], as_index=False)['inspection_id']\
.count()\
.rename(columns={'inspection_id': 'count'})\
.sort_values(by="count", ascending=False)\
.head(5)
inspections_by_type_stablishment
| facility_type | count | |
|---|---|---|
| 393 | Restaurant | 174177 |
| 220 | Grocery Store | 32513 |
| 412 | School | 15971 |
| 114 | Children's Services Facility | 5306 |
| 45 | Bakery | 3768 |
top_10= dataset.groupby('facility_type')['inspection_type'].value_counts().nlargest(10)
print(top_10)
facility_type inspection_type
Restaurant Canvass 91116
Canvass Re-Inspection 19282
License 19239
Complaint 19187
Grocery Store Canvass 13375
School Canvass 12279
Restaurant Complaint Re-Inspection 7714
License Re-Inspection 6182
Short Form Complaint 6129
Grocery Store License 5608
Name: inspection_type, dtype: int64
dataset.columns
Index(['inspection_id', 'dba_name', 'aka_name', 'license_', 'facility_type',
'risk', 'address', 'city', 'state', 'zip', 'inspection_date',
'inspection_type', 'results', 'latitude', 'longitude', 'location',
'violations', 'year_inspection'],
dtype='object')
sns.set_theme(style='darkgrid', context='paper') #style= dark, darkgrid,white, whitegrid,thicks #context= paper, notebook, talk, poster
sns.relplot(data=dataset)
plt.show()
sns.scatterplot(x='risk', y='results', data=dataset)
<Axes: xlabel='risk', ylabel='results'>
sns.scatterplot(x='facility_type', y='risk', data=dataset);
graf_bar = sns.barplot(x='facility_type', y='count', data=inspections_by_type_stablishment)
sns.set_theme(style="darkgrid", context="paper")
graf_bar.set_title("Los 5 tipos de establecimiento más inspeccionados")
graf_bar.set_xlabel("Tipo de establecimiento")
graf_bar.set_ylabel("Total de inspecciones")
graf_bar.yaxis.set_major_formatter(FuncFormatter(number_formatter))
graf_bar.set_xticklabels(graf_bar.get_xticklabels(), rotation=90)
[Text(0, 0, 'Restaurant'), Text(1, 0, 'Grocery Store'), Text(2, 0, 'School'), Text(3, 0, "Children's Services Facility"), Text(4, 0, 'Bakery')]
dataset['year_inspection'] = dataset.inspection_date.dt.year
inspections_by_year = dataset.groupby(['year_inspection'], as_index=False)['inspection_id']\
.count()\
.rename(columns={'inspection_id': 'count'})\
.sort_values(by="year_inspection")
inspections_by_year
| year_inspection | count | |
|---|---|---|
| 0 | 2010 | 18002 |
| 1 | 2011 | 18668 |
| 2 | 2012 | 18796 |
| 3 | 2013 | 20893 |
| 4 | 2014 | 21485 |
| 5 | 2015 | 20852 |
| 6 | 2016 | 22747 |
| 7 | 2017 | 21507 |
| 8 | 2018 | 17117 |
| 9 | 2019 | 18971 |
| 10 | 2020 | 15086 |
| 11 | 2021 | 15809 |
| 12 | 2022 | 16866 |
| 13 | 2023 | 12034 |
graf_bar = sns.barplot(x="year_inspection", y="count", data=inspections_by_year)
graf_bar.set_title("Número de inspecciones por año")
graf_bar.set_ylabel('# inspecciones')
graf_bar.set_xlabel("Año")
Text(0.5, 0, 'Año')
menos = dataset.groupby(['inspection_type'], as_index=False)['inspection_id']\
.count()\
.sort_values(by="inspection_id", ascending=True)\
.rename(columns={'inspection_id': 'count'})\
.head(5)
menos
| inspection_type | count | |
|---|---|---|
| 0 | 1315 license reinspection | 1 |
| 78 | Summer Feeding | 1 |
| 77 | Special Task Force | 1 |
| 73 | Sample Collection | 1 |
| 71 | SMOKING COMPLAINT | 1 |
g = sns.barplot(x="inspection_type", y="count",data=menos)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
sns.set_theme(style="white", context="poster")
graf_bar = dataset.inspection_type.value_counts(normalize=True).sort_values(ascending=True).head(5)
graf_bar
ADDENDUM 0.000004 Duplicated 0.000004 finish complaint inspection from 5-18-10 0.000004 CLOSE-UP/COMPLAINT REINSPECTION 0.000004 LICENSE 0.000004 Name: inspection_type, dtype: float64
#b = pd.DataFrame({'inspection_type': a.index,
# 'prop': a})
#b
b = pd.DataFrame({'inspection_type': graf_bar.index,
'prop': graf_bar})
b
| inspection_type | prop | |
|---|---|---|
| ADDENDUM | ADDENDUM | 0.000004 |
| Duplicated | Duplicated | 0.000004 |
| finish complaint inspection from 5-18-10 | finish complaint inspection from 5-18-10 | 0.000004 |
| CLOSE-UP/COMPLAINT REINSPECTION | CLOSE-UP/COMPLAINT REINSPECTION | 0.000004 |
| LICENSE | LICENSE | 0.000004 |
g = sns.barplot(x="inspection_type", y="prop", data=b)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_ylim(0,1)
sns.set_theme(style="darkgrid", context="poster")
graf_bar = dataset.results.value_counts(normalize=True)*100
graf_bar
Pass 51.459822 Fail 19.507945 Pass w/ Conditions 15.347734 Out of Business 8.485780 No Entry 3.925697 Not Ready 1.242500 Business Not Located 0.030522 Name: results, dtype: float64
px.set_mapbox_access_token(open("/content/sample_data/mapbox_token").read())
# @title Texto de título predeterminado
fig = px.scatter_mapbox(dataset, lat="latitude", lon="longitude", hover_name="city", color="results",
hover_data=["aka_name", "facility_type"], zoom=9, height=400, opacity=0.3)
fig.show()